#collapse-hide
# Imports
import os
import pandas as pd
import csv
import kaggle

# other imports
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from copy import copy
import seaborn as sns
from scipy.stats import norm
import matplotlib.dates as mdates
# import matplotlib.colors as mcolors
# import random
# import math
# import time
# from sklearn.linear_model import LinearRegression, BayesianRidge
# from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
# from sklearn.svm import SVR
from datetime import date, datetime
from dateutil.parser import parse
import us
# import operator 
# plt.style.use('fivethirtyeight')
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline 
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\deprecation.py:143: FutureWarning: The sklearn.utils.testing module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.utils. Anything that cannot be imported from sklearn.utils is now part of the private API.
  warnings.warn(message, FutureWarning)

Covid Tracking Dataset (w/ hospitalised data)

Source: https://covidtracking.com/

Load and Clean the Data

#collapse-hide
all_cases = pd.read_csv('https://covidtracking.com/api/v1/states/daily.csv')

# Delete unnecessary columns (the original comment said "rows")
all_cases = all_cases.drop(columns=[
    'negative', 'pending', 'hash', 'negativeIncrease', 'totalTestResults',
    'totalTestResultsIncrease', 'dateChecked', 'fips', 'inIcuCumulative',
    'onVentilatorCumulative', 'total', 'posNeg', 'deathIncrease',
    'hospitalizedIncrease', 'positiveIncrease'])

# TODO missing values
#      Do we get avg or missing values, or predict them?
#      See https://developerzen.com/data-mining-handling-missing-values-the-database-bd2241882e72

# Parse the integer YYYYMMDD 'date' column into datetime.date objects in one
# vectorized pass (replaces the per-row iterrows loop, same resulting values)
all_cases['date'] = pd.to_datetime(all_cases['date'].astype(str), format='%Y%m%d').dt.date

# Missing death figures mean no death reports yet; set them to 0
all_cases['death'] = all_cases['death'].fillna(0)

Missing values: Retrieving from other datasets or through merging columns (or both)

The following will be done:

  • Active Cases: Retrieved from JHU dataset and calculating $active = pos - dead - recovered$
  • Beds per State: Retrieved from External Datasets

#collapse-hide
# TODO Replace active cases with JHU and/or regression model (Selma)
# Estimate active cases as positives minus recovered minus deaths
all_cases['active'] = all_cases['positive'] - all_cases['recovered'] - all_cases['death']
# Move the new 'active' column to position 3 for readability
reordered = list(all_cases.columns)
reordered.insert(3, reordered.pop(reordered.index('active')))
all_cases = all_cases.loc[:, reordered]

#collapse-hide
# Load datasets for US population and hospital beds per 1000
us_population = pd.read_csv('data/us_population.csv')
hosp_beds = pd.read_csv('data/hospital_beds.csv')
state_abbrev = pd.read_csv('data/us_state_names.csv')

# Map full state names to their two-letter abbreviations in one vectorized
# Series.map pass instead of the original per-state assignment loop.
# Rows whose name has no match (e.g. the national-total row visible in
# hosp_beds.head()) get NaN, exactly as before.
name_to_abbrev = dict(zip(state_abbrev['State'], state_abbrev['Abbreviation']))
us_population['Abbreviation'] = us_population['State'].map(name_to_abbrev)
hosp_beds['Abbreviation'] = hosp_beds['Location'].map(name_to_abbrev)

# change order of columns of us_population
cols = list(us_population)
cols.insert(2, cols.pop(cols.index('Abbreviation')))
us_population = us_population.loc[:, cols]

# drop unnecessary columns of us_population
us_population = us_population.drop(columns=['rank', 'Growth', 'Pop2018', 'Pop2010', 'growthSince2010', 'Percent', 'density'])

# drop unnecessary columns of hosp_beds
hosp_beds = hosp_beds.drop(columns=['Location', 'State/Local Government', 'Non-Profit', 'For-Profit'])

# change order of columns of hosp_beds
cols = list(hosp_beds)
cols.insert(0, cols.pop(cols.index('Abbreviation')))
hosp_beds = hosp_beds.loc[:, cols]
us_population.head()
State Abbreviation Pop
0 Alabama AL 4908621
1 Alaska AK 734002
2 Arizona AZ 7378494
3 Arkansas AR 3038999
4 California CA 39937489
hosp_beds.head()
Abbreviation Total
0 NaN 2.4
1 AL 3.1
2 AK 2.2
3 AZ 1.9
4 AR 3.2

#collapse-hide
# Keep only rows whose 'state' is a real US state abbreviation; this drops
# territories such as 'AS' that have no entry in state_abbrev.
valid_abbrevs = state_abbrev['Abbreviation'].tolist()
all_cases = all_cases[all_cases['state'].isin(valid_abbrevs)]
# Inspect the filtered frame
all_cases.head()
date state positive active hospitalizedCurrently hospitalizedCumulative inIcuCurrently onVentilatorCurrently recovered dataQualityGrade ... totalTestsViral positiveTestsViral negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade
0 2020-07-06 AK 1138.0 574.0 19.0 NaN NaN 3.0 548.0 A ... 123753.0 NaN NaN NaN 0 0 0 0 0 NaN
1 2020-07-06 AL 44878.0 21789.0 1016.0 2914.0 NaN NaN 22082.0 B ... NaN NaN NaN 44375.0 0 0 0 0 0 NaN
2 2020-07-06 AR 24253.0 6127.0 337.0 1575.0 NaN 81.0 17834.0 A ... 344529.0 NaN NaN 24253.0 0 0 0 0 0 NaN
4 2020-07-06 AZ 101441.0 87795.0 3212.0 5188.0 839.0 533.0 11836.0 A+ ... 617343.0 NaN 100943.0 NaN 0 0 0 0 0 NaN
5 2020-07-06 CA 271684.0 NaN 7278.0 NaN 1937.0 NaN NaN B ... 4793353.0 NaN NaN 271684.0 0 0 0 0 0 NaN

5 rows × 25 columns

#collapse-hide
# Per-date and per-state views of the dataset
df_split_by_date = {day: frame for day, frame in all_cases.groupby('date')}
df_split_by_state = {abbr: frame for abbr, frame in all_cases.groupby('state')}

# Attach each state's population to the daily rows
df_merge_uspop = (all_cases
                  .merge(us_population, how='left', left_on='state', right_on='Abbreviation')
                  .drop(columns=['Abbreviation'])
                  .rename(columns={'Pop': 'population'}))

# move the new 'population' column next to the state column
order = list(df_merge_uspop.columns)
order.insert(2, order.pop(order.index('population')))
df_merge_uspop = df_merge_uspop.loc[:, order]

# Attach the beds-per-1000 figure the same way
df_merge_hosp = (df_merge_uspop
                 .merge(hosp_beds, how='left', left_on='state', right_on='Abbreviation')
                 .drop(columns=['Abbreviation']))
all_cases = df_merge_hosp.rename(columns={'Total': 'bedsPerThousand'})
all_cases.head()
date state population positive active hospitalizedCurrently hospitalizedCumulative inIcuCurrently onVentilatorCurrently recovered ... negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade State bedsPerThousand
0 2020-07-06 AK 734002 1138.0 574.0 19.0 NaN NaN 3.0 548.0 ... NaN NaN 0 0 0 0 0 NaN Alaska 2.2
1 2020-07-06 AL 4908621 44878.0 21789.0 1016.0 2914.0 NaN NaN 22082.0 ... NaN 44375.0 0 0 0 0 0 NaN Alabama 3.1
2 2020-07-06 AR 3038999 24253.0 6127.0 337.0 1575.0 NaN 81.0 17834.0 ... NaN 24253.0 0 0 0 0 0 NaN Arkansas 3.2
3 2020-07-06 AZ 7378494 101441.0 87795.0 3212.0 5188.0 839.0 533.0 11836.0 ... 100943.0 NaN 0 0 0 0 0 NaN Arizona 1.9
4 2020-07-06 CA 39937489 271684.0 NaN 7278.0 NaN 1937.0 NaN NaN ... NaN 271684.0 0 0 0 0 0 NaN California 1.8

5 rows × 28 columns

#collapse-hide
# Absolute hospital-bed capacity per state: (population / 1000) * beds-per-1000
all_cases['total_beds'] = all_cases['population'] / 1000 * all_cases['bedsPerThousand']
# 'state' currently holds the abbreviation; swap in the full name and keep
# the abbreviation under its own column (single rename call, same effect)
all_cases = all_cases.rename(columns={'state': 'abbrev', 'State': 'state'})
# move the full state name right after the date column
order = list(all_cases.columns)
order.insert(1, order.pop(order.index('state')))
all_cases = all_cases.loc[:, order]
all_cases.head()
date state abbrev population positive active hospitalizedCurrently hospitalizedCumulative inIcuCurrently onVentilatorCurrently ... negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade bedsPerThousand total_beds
0 2020-07-06 Alaska AK 734002 1138.0 574.0 19.0 NaN NaN 3.0 ... NaN NaN 0 0 0 0 0 NaN 2.2 1614.8044
1 2020-07-06 Alabama AL 4908621 44878.0 21789.0 1016.0 2914.0 NaN NaN ... NaN 44375.0 0 0 0 0 0 NaN 3.1 15216.7251
2 2020-07-06 Arkansas AR 3038999 24253.0 6127.0 337.0 1575.0 NaN 81.0 ... NaN 24253.0 0 0 0 0 0 NaN 3.2 9724.7968
3 2020-07-06 Arizona AZ 7378494 101441.0 87795.0 3212.0 5188.0 839.0 533.0 ... 100943.0 NaN 0 0 0 0 0 NaN 1.9 14019.1386
4 2020-07-06 California CA 39937489 271684.0 NaN 7278.0 NaN 1937.0 NaN ... NaN 271684.0 0 0 0 0 0 NaN 1.8 71887.4802

5 rows × 29 columns

  • Load and clean JHU data
  • Merge JHU dataset with main dataset

#collapse-hide
# This cell takes some time, as it needs to connect to Kaggle Servers to retrieve data
# NOTE: requires Kaggle API credentials to be configured for authenticate()
kaggle.api.authenticate()
# Download and unzip the JHU CSSE daily-reports dataset into ./kaggle/input/
kaggle.api.dataset_download_files('benhamner/jhucovid19', path='./kaggle/input/jhucovid19/', unzip=True)

#collapse-hide
# Collect every JHU daily-report CSV into a single DataFrame
dir_jhu = './kaggle/input/jhucovid19/csse_covid_19_data/csse_covid_19_daily_reports'

df_list = []
for dirname, _, files in os.walk(dir_jhu):
    df_list.extend(
        pd.read_csv(os.path.join(dirname, fname))
        for fname in files
        if 'gitignore' not in fname and 'README' not in fname
    )

jhu_df = pd.concat(df_list, axis=0, ignore_index=True, sort=True)

# Normalize both spellings of the update-date column to plain date values
jhu_df.loc[:, 'Last Update'] = pd.to_datetime(jhu_df['Last Update']).apply(lambda ts: ts.date())
jhu_df.loc[:, 'Last_Update'] = pd.to_datetime(jhu_df['Last_Update']).apply(lambda ts: ts.date())

# The JHU schema changed over time; coalesce each old/new column pair
jhu_df['LastUpdate'] = jhu_df['Last_Update'].combine_first(jhu_df['Last Update'])
jhu_df['CountryRegion'] = jhu_df['Country/Region'].combine_first(jhu_df['Country_Region'])

# Keep US rows only
jhu_df = jhu_df[jhu_df['CountryRegion'] == 'US']

jhu_df['ProvinceState'] = jhu_df['Province/State'].combine_first(jhu_df['Province_State'])

# Drop location/metadata columns that are no longer needed
jhu_df = jhu_df.drop(columns=['Admin2', 'Lat', 'Latitude', 'Long_', 'Longitude',
                              'Combined_Key', 'Country/Region', 'Country_Region',
                              'Province/State', 'Province_State',
                              'Last Update', 'Last_Update', 'FIPS'])

# Move the coalesced columns to the front, in this order
cols = list(jhu_df)
for pos, name in enumerate(['CountryRegion', 'ProvinceState', 'LastUpdate']):
    cols.insert(pos, cols.pop(cols.index(name)))
jhu_df = jhu_df.loc[:, cols]

# Abbreviation -> full state name lookup built from the `us` package
state_abbrs_dict = {state.abbr: state.name for state in us.states.STATES}

def toState(input_state, mapping):
    """Resolve a trailing two-letter state abbreviation to a full name.

    The last two characters of *input_state* (after stripping trailing
    whitespace) are looked up in *mapping*; if they are not a known
    abbreviation, the input is returned unchanged.
    """
    abbreviation = input_state.rstrip()[-2:]
    return mapping.get(abbreviation, input_state)

# Expand trailing state abbreviations to full names; 'Washington, D.C.' is
# special-cased because its last two characters are not an abbreviation.
jhu_df['ProvinceState'] = jhu_df['ProvinceState'].apply(
    lambda name: 'District of Columbia' if name == 'Washington, D.C.'
    else toState(name, state_abbrs_dict))

# Keep only states that exist in the main dataset
known_states = all_cases.state.unique().tolist()
jhu_df = jhu_df[jhu_df['ProvinceState'].isin(known_states)]

# Sum county-level rows into one row per (date, state)
jhu_df = (jhu_df
          .groupby(['LastUpdate', 'ProvinceState'])
          .agg({'Active': 'sum', 'Confirmed': 'sum',
                'Deaths': 'sum', 'Recovered': 'sum'})
          .reset_index())

jhu_df.tail()
LastUpdate ProvinceState Active Confirmed Deaths Recovered
5802 2020-07-01 Virginia 61024.0 62787.0 1763.0 0.0
5803 2020-07-01 Washington 31492.0 32824.0 1332.0 0.0
5804 2020-07-01 West Virginia 2812.0 2905.0 93.0 0.0
5805 2020-07-01 Wisconsin 27875.0 28659.0 784.0 0.0
5806 2020-07-01 Wyoming 1467.0 1487.0 20.0 0.0

#collapse-hide
# Now that we have the JHU dataset relatively cleaned
# we can go ahead and merge its data with our main dataset

# Row-by-row merge: for each (date, state) pair in all_cases, pull the
# matching JHU 'Active' figure, then backfill missing recovery/active values.
for i, row in all_cases.iterrows():
    last_update = all_cases.at[i, 'date']
    state = all_cases.at[i, 'state']
    # JHU rows for this state, narrowed to the same report date
    matching_row = jhu_df[jhu_df['ProvinceState'] == state]
    matching_row = matching_row[matching_row['LastUpdate'] == last_update].reset_index()

    # A JHU row exists for this (date, state): take its active-case count
    if len(matching_row.values) > 0:
        #all_cases.at[i, 'positive'] = matching_row['Confirmed'].values[0]
        all_cases.at[i, 'active'] = matching_row['Active'].values[0]
        #all_cases.at[i, 'recovered'] = matching_row['Recovered'].values[0]   --- JHU was inconsistent, therefore removed
        #all_cases.at[i, 'death'] = matching_row['Deaths'].values[0]

    # Replace unknown recovery numbers with 0
    if np.isnan(row['recovered']):
        all_cases.at[i, 'recovered'] = 0

    # NOTE(review): row['active'] is the pre-iteration snapshot, so when the
    # original value was NaN this branch overwrites the JHU figure assigned
    # above with positive - recovered - death — confirm that is intended.
    if all_cases.at[i, 'active'] == 0 or np.isnan(row['active']):
        positive = all_cases.at[i, 'positive']
        recovered = all_cases.at[i, 'recovered']
        dead = all_cases.at[i, 'death']
        all_cases.at[i, 'active'] = positive - recovered - dead

all_cases.tail()
date state abbrev population positive active hospitalizedCurrently hospitalizedCumulative inIcuCurrently onVentilatorCurrently ... negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade bedsPerThousand total_beds
6335 2020-01-26 Washington WA 7797095 2.0 2.0 NaN NaN NaN NaN ... NaN NaN 0 0 0 0 0 NaN 1.7 13255.0615
6336 2020-01-25 Washington WA 7797095 2.0 2.0 NaN NaN NaN NaN ... NaN NaN 0 0 0 0 0 NaN 1.7 13255.0615
6337 2020-01-24 Washington WA 7797095 2.0 2.0 NaN NaN NaN NaN ... NaN NaN 0 0 0 0 0 NaN 1.7 13255.0615
6338 2020-01-23 Washington WA 7797095 2.0 2.0 NaN NaN NaN NaN ... NaN NaN 0 0 0 0 0 NaN 1.7 13255.0615
6339 2020-01-22 Washington WA 7797095 2.0 2.0 NaN NaN NaN NaN ... NaN NaN 0 0 0 0 0 NaN 1.7 13255.0615

5 rows × 29 columns

#collapse-hide
# Save formatted dataset offline in case of disaster
# (this snapshot is re-read below for the exploratory analysis section)
dataset_file = 'results/all_cases.csv'
all_cases.to_csv(dataset_file)

#collapse-hide
# convert date to datetime format for time-based masking and plotting
all_cases['date'] = pd.to_datetime(all_cases['date'])

An Exploratory data analysis of the US dataset

Basic triad of the dataset: validating data types and data integrity of each row

#collapse-hide
# Reload the saved snapshot so this section is reproducible offline
dataset_file = 'results/all_cases.csv'
covid_df = pd.read_csv(dataset_file, index_col=0)
# the CSV round-trip turns dates into strings; restore the datetime dtype
covid_df['date'] = pd.to_datetime(covid_df['date'])
covid_df.info()
# display floats with 3 decimal places from here on
pd.set_option('display.float_format', '{:.3f}'.format)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6340 entries, 0 to 6339
Data columns (total 29 columns):
date                      6340 non-null datetime64[ns]
state                     6340 non-null object
abbrev                    6340 non-null object
population                6340 non-null int64
positive                  6340 non-null float64
active                    6340 non-null float64
hospitalizedCurrently     4201 non-null float64
hospitalizedCumulative    3522 non-null float64
inIcuCurrently            2130 non-null float64
onVentilatorCurrently     1858 non-null float64
recovered                 6340 non-null float64
dataQualityGrade          5406 non-null object
lastUpdateEt              5985 non-null object
dateModified              5985 non-null object
checkTimeEt               5985 non-null object
death                     6340 non-null float64
hospitalized              3522 non-null float64
totalTestsViral           1787 non-null float64
positiveTestsViral        615 non-null float64
negativeTestsViral        624 non-null float64
positiveCasesViral        3480 non-null float64
commercialScore           6340 non-null int64
negativeRegularScore      6340 non-null int64
negativeScore             6340 non-null int64
positiveScore             6340 non-null int64
score                     6340 non-null int64
grade                     0 non-null float64
bedsPerThousand           6340 non-null float64
total_beds                6340 non-null float64
dtypes: datetime64[ns](1), float64(16), int64(6), object(6)
memory usage: 1.5+ MB
covid_df.head()
date state abbrev population positive active hospitalizedCurrently hospitalizedCumulative inIcuCurrently onVentilatorCurrently ... negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade bedsPerThousand total_beds
0 2020-07-06 Alaska AK 734002 1138.000 574.000 19.000 nan nan 3.000 ... nan nan 0 0 0 0 0 nan 2.200 1614.804
1 2020-07-06 Alabama AL 4908621 44878.000 21789.000 1016.000 2914.000 nan nan ... nan 44375.000 0 0 0 0 0 nan 3.100 15216.725
2 2020-07-06 Arkansas AR 3038999 24253.000 6127.000 337.000 1575.000 nan 81.000 ... nan 24253.000 0 0 0 0 0 nan 3.200 9724.797
3 2020-07-06 Arizona AZ 7378494 101441.000 87795.000 3212.000 5188.000 839.000 533.000 ... 100943.000 nan 0 0 0 0 0 nan 1.900 14019.139
4 2020-07-06 California CA 39937489 271684.000 265347.000 7278.000 nan 1937.000 nan ... nan 271684.000 0 0 0 0 0 nan 1.800 71887.480

5 rows × 29 columns

The NaN values may indicate that there were few or no Covid-19 patients at these dates. We further analyse the statistical values of the dataset columns to ensure data integrity and accuracy.

# Summary statistics (count, mean, std, quantiles) for the numeric columns
covid_df.describe()
# TODO rounding up the numbers
population positive active hospitalizedCurrently hospitalizedCumulative inIcuCurrently onVentilatorCurrently recovered death hospitalized ... negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade bedsPerThousand total_beds
count 6340.000 6340.000 6340.000 4201.000 3522.000 2130.000 1858.000 6340.000 6340.000 3522.000 ... 624.000 3480.000 6340.000 6340.000 6340.000 6340.000 6340.000 0.000 6340.000 6340.000
mean 6539970.930 23267.198 21026.928 970.345 4570.831 426.305 211.508 5713.764 1182.194 4570.831 ... 335040.093 34427.189 0.000 0.000 0.000 0.000 0.000 nan 2.626 15802.628
std 7386428.876 49472.806 45144.177 1854.582 13131.257 692.299 316.334 13840.533 3037.581 13131.257 ... 440322.631 58870.244 0.000 0.000 0.000 0.000 0.000 nan 0.744 16158.667
min 567025.000 0.000 0.000 1.000 0.000 1.000 0.000 0.000 0.000 0.000 ... 17.000 0.000 0.000 0.000 0.000 0.000 0.000 nan 1.600 1318.928
25% 1778070.000 742.750 655.750 103.000 242.000 76.000 34.000 0.000 16.000 242.000 ... 54633.000 5364.000 0.000 0.000 0.000 0.000 0.000 nan 2.100 3773.952
50% 4499692.000 5857.000 5367.000 393.000 1069.500 165.500 91.000 391.000 172.000 1069.500 ... 165316.000 14990.000 0.000 0.000 0.000 0.000 0.000 nan 2.500 11557.920
75% 7797095.000 23877.750 20932.250 927.000 3556.500 434.750 231.000 3856.000 880.250 3556.500 ... 439626.750 38193.000 0.000 0.000 0.000 0.000 0.000 nan 3.100 19124.737
max 39937489.000 397649.000 553611.000 18825.000 89995.000 5225.000 2425.000 103782.000 24913.000 89995.000 ... 2438200.000 397649.000 0.000 0.000 0.000 0.000 0.000 nan 4.800 71887.480

8 rows × 22 columns

#collapse-hide
# drop duplicated/raw-capacity columns before normalising
covid_cleaned = covid_df.drop(['hospitalized', 'bedsPerThousand'], axis=1)
covid_100k = covid_cleaned.copy()
# list of columns to re-express per 100,000 inhabitants
columns_list = ['positive', 'active', 'recovered', 'death', 'hospitalizedCurrently', 'hospitalizedCumulative', 'inIcuCurrently', 'onVentilatorCurrently', 'total_beds']
# add per-100k columns; 'total_beds' keeps its historical name 'BedsPer100k'
# (the original code called .format() on a string with no placeholder — a no-op)
for column in columns_list:
    new_name = 'BedsPer100k' if column == 'total_beds' else '{}_100k'.format(column)
    covid_100k[new_name] = (covid_cleaned.loc[:, column] / covid_cleaned.loc[:, 'population']) * 100000

# drop the absolute-valued originals
covid_100k = covid_100k.drop(columns_list, axis=1)

#collapse-hide
covid_100k['date'] = pd.to_datetime(covid_100k['date'])
# one-month window: exclusive start, inclusive end
start_date = '2020-04-18'
end_date = '2020-05-19'
in_window = covid_100k['date'].gt(start_date) & covid_100k['date'].le(end_date)
covid_100k_last_month = covid_100k.loc[in_window]

#collapse-hide
# Daily national totals, computed once and sliced twice below
daily_totals = covid_100k_last_month.groupby('date').sum()

# cumulative series become day-over-day increments
covid_100k_last_month_part1 = daily_totals[['positive_100k', 'active_100k', 'recovered_100k', 'death_100k', 'hospitalizedCumulative_100k']].diff(periods=1, axis=0)

# point-in-time series are kept as-is
covid_100k_last_month_part2 = daily_totals[['inIcuCurrently_100k', 'onVentilatorCurrently_100k', 'BedsPer100k']]

final_100k_last_month = covid_100k_last_month_part1.merge(covid_100k_last_month_part2, left_index=True, right_index=True)
final_100k_last_month.head()
positive_100k active_100k recovered_100k death_100k hospitalizedCumulative_100k inIcuCurrently_100k onVentilatorCurrently_100k BedsPer100k
date
2020-04-19 nan nan nan nan nan 152.818 80.717 13440.000
2020-04-20 413.759 391.692 35.481 25.728 22.652 155.542 79.710 13440.000
2020-04-21 387.394 360.446 65.218 30.520 31.446 164.605 78.603 13440.000
2020-04-22 428.601 989.954 412.625 28.780 36.181 165.884 78.032 13440.000
2020-04-23 452.031 -2213.482 72.921 26.282 28.842 164.122 94.521 13440.000
final_100k_last_month.describe()
positive_100k active_100k recovered_100k death_100k hospitalizedCumulative_100k inIcuCurrently_100k onVentilatorCurrently_100k BedsPer100k
count 30.000 30.000 30.000 30.000 30.000 31.000 31.000 31.000
mean 399.188 364.943 147.172 23.063 39.160 139.595 73.503 13440.000
std 58.939 634.169 81.341 6.102 43.524 17.123 8.141 0.000
min 287.019 -2213.482 35.481 13.053 9.507 109.602 61.622 13440.000
25% 348.980 314.204 80.563 17.951 22.991 126.370 66.261 13440.000
50% 405.026 366.234 127.774 24.119 28.295 140.327 74.706 13440.000
75% 432.647 419.664 212.491 26.243 32.754 151.795 79.157 13440.000
max 544.349 2291.210 412.625 33.917 246.371 165.884 94.521 13440.000

#collapse-hide
# save description cleaned dataset to csv
# (summary statistics only, not the full per-row data)
describe_file = 'results/final_100k_last_month.csv'
final_100k_last_month.describe().to_csv(describe_file)

Graphical Exploratory Analysis

Plotting histograms, scatterplots and boxplots to assess the distribution of the entire US dataset.

#collapse-hide

# Restrict the cleaned frame to the same one-month window and keep only the
# numeric columns used by the plots below (categorical state/abbrev and the
# date column are omitted).
covid_cleaned['date'] = pd.to_datetime(covid_cleaned['date'])
start_date = '2020-04-18'
end_date = '2020-05-19'
mask = covid_cleaned['date'].gt(start_date) & covid_cleaned['date'].le(end_date)
covid_cleaned_last_month = covid_cleaned.loc[mask]
plot_df = covid_cleaned_last_month[['population', 'active', 'recovered', 'death', 'hospitalizedCurrently', 'inIcuCurrently', 'onVentilatorCurrently', 'total_beds']]
plot_df_last_month = covid_100k_last_month[['population', 'active_100k', 'recovered_100k', 'death_100k', 'hospitalizedCurrently_100k', 'inIcuCurrently_100k', 'onVentilatorCurrently_100k', 'BedsPer100k']]

#collapse-hide

# National per-100k time series: sum the state rows for each date
per100k_cols = ['date', 'positive_100k', 'active_100k', 'recovered_100k', 'death_100k', 'hospitalizedCurrently_100k', 'inIcuCurrently_100k', 'onVentilatorCurrently_100k', 'BedsPer100k']
timeseries_usa_df = covid_100k.loc[:, per100k_cols].groupby('date').sum().reset_index()
timeseries_usa_df.tail()
date positive_100k active_100k recovered_100k death_100k hospitalizedCurrently_100k inIcuCurrently_100k onVentilatorCurrently_100k BedsPer100k
162 2020-07-02 37608.249 20257.997 15719.395 1630.857 452.273 69.727 35.282 13440.000
163 2020-07-03 38289.967 20841.847 15808.317 1639.803 451.256 71.286 34.451 13440.000
164 2020-07-04 38895.595 21281.080 15970.189 1644.326 453.454 70.500 33.091 13440.000
165 2020-07-05 39405.292 21677.832 16080.428 1647.031 457.637 71.160 34.296 13440.000
166 2020-07-06 39955.634 21875.414 16429.634 1650.586 465.880 70.154 35.229 13440.000

#collapse-hide

# get data from last day
# plot_df_last_date = plot_df.loc[covid_df['date'] == '2020-05-18']

# Histograms of each per-100k variable, with a fitted normal curve for
# reference, to assess distribution shape, skewness and scale.
fig, axs = plt.subplots(4, 2, figsize=(16, 16))
sns.set()
for i, column in enumerate(plot_df_last_month.columns):
    # fill the 4x2 grid row by row (same layout as the parity if/else it replaces)
    ax = axs[i // 2, i % 2]
    sns.distplot(plot_df_last_month[column], fit=norm, fit_kws=dict(label='normality'), hist_kws=dict(color='plum', edgecolor='k', linewidth=1, label='frequency'), ax=ax, color='#9d53ad')
    ax.legend(loc='upper right')
plt.tight_layout()
fig.subplots_adjust(top=0.95)

#collapse-hide

# Scatterplots of every independent variable against the target
# (hospitalizedCurrently) to eyeball linearity and variance.
target = plot_df.hospitalizedCurrently
indep_var = plot_df.drop(columns=['hospitalizedCurrently'])

fig, ax = plt.subplots(figsize=(16, 16))
for i, col in enumerate(indep_var.columns):
    ax = fig.add_subplot(4, 3, i + 1)
    sns.regplot(x=indep_var[col], y=target, data=indep_var, label=col, scatter_kws={'s': 10}, line_kws={"color": "plum", 'label': 'hospitCurr'})
    plt.legend()
# figure-level title — set once instead of on every loop iteration
plt.suptitle('Scatterplots with Target Hospitalized Patients Showing Growth Trajectories', fontsize=23)
plt.tight_layout()
fig.subplots_adjust(top=0.95)

#collapse-hide

# Boxplots (outliers hidden) to assess the symmetry of each distribution
fig, ax = plt.subplots(figsize=(16, 12))
for i, col in enumerate(plot_df.columns):
    ax = fig.add_subplot(4, 3, i + 1)
    sns.boxplot(x=plot_df[col], data=plot_df, color='lightblue', showfliers=False)
# figure-level title — set once instead of on every loop iteration
plt.suptitle('Boxplots of Independent Variables', fontsize=23)
plt.tight_layout()
fig.subplots_adjust(top=0.95)

#collapse-hide

# Same boxplots for a single day, outliers included.
# NOTE: the mask must be built on plot_df's own parent frame
# (covid_cleaned_last_month, identical index) — a mask built on covid_df has
# a larger index and raises "Unalignable boolean Series" on modern pandas.
plot_df_last_date = plot_df.loc[covid_cleaned_last_month['date'] == '2020-05-18']

fig, ax = plt.subplots(figsize=(16, 12))
for i, col in enumerate(plot_df_last_date.columns):
    ax = fig.add_subplot(4, 3, i + 1)
    sns.boxplot(x=plot_df_last_date[col], data=plot_df, color='lightblue', showfliers=True)
# figure-level title — set once instead of on every loop iteration
plt.suptitle('Boxplots of Independent Variables', fontsize=23)
plt.tight_layout()
fig.subplots_adjust(top=0.95)

Analysis of Hospitalizations by State

Alabama

C:\Users\Doctor Gomez\AppData\Roaming\Python\Python37\site-packages\pandas\plotting\_converter.py:129: FutureWarning:

Using an implicitly registered datetime converter for a matplotlib plotting method. The converter was registered by pandas on import. Future versions of pandas will require you to explicitly register matplotlib converters.

To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Arizona

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, '% Positive Cases in Hospital')
Text(0, 0.5, 'No. Patients')

Arkansas

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')

California

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')

Colorado

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')
Text(0, 0.5, 'No. Killed')

Connecticut

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')
Text(0, 0.5, 'No. Killed')

Delaware

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Florida

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
# TODO fix legend/axis/plot altogether
# Timeseries plot of cumulative positive viral tests for Florida.
# NOTE(review): `fl` is defined in a hidden cell (presumably the Florida
# frame from df_split_by_state) — confirm before running standalone.
fig, ax = plt.subplots(figsize = (16, 12))
plt.plot(fl.date, fl.positiveTestsViral, linewidth=4.7, color='r')
plt.title('Cummulative Number of Positive Viral Tests in Florida', fontsize=23)
plt.xlabel('Date')
plt.ylabel('No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, '% Infected')

Georgia

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, '% Infection Rate')

Hawaii

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')
Text(0, 0.5, 'No. Killed')
Text(0, 0.5, 'No. Killed')
Text(0, 0.5, 'No. Killed')
Text(0, 0.5, '% Infected')

Idaho

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')
Text(0, 0.5, 'No. Patients')

Iowa

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Kansas

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')

Kentucky

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')

Louisiana

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Maine

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Maryland

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Massachusetts

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Michigan

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Minnesota

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Mississippi

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Missouri

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Montana

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Nebraska

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Nevada:

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')

New Hampshire

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

New Jersey

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

New Mexico

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

New York

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')

North Carolina

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Ohio

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Oklahoma

Text(0, 0.5, 'No. Patients')

Oregon

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Pennsylvania

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Rhode Island

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

South Carolina

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, '% Infection Rate')

South Dakota

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Tennessee

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Texas

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')

Utah

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')

Vermont

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Virginia

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Washington

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

West Virginia

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Wisconsin

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Wyoming

Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Patients')
Text(0, 0.5, 'No. Killed')

Assessing Correlation of Independent Variables

# TODO add some explanation / look more into collinear variables
# Pearson correlation heatmap of the cleaned numeric columns
corr = covid_cleaned.corr(method='pearson')
# Upper-triangle mask hides the duplicate half of the symmetric matrix.
# np.bool (deprecated in NumPy 1.20, removed in 1.24) replaced by builtin bool.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(16, 16))

# Generate heatmap
sns.heatmap(corr, annot=True, mask=mask, cmap='GnBu', center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
<matplotlib.axes._subplots.AxesSubplot at 0x14e2aa60a48>

Build model for dependent Variable

  • To be used to predict current hospitalizations
  • Having more complete variables for in ICU currently and on Ventilator Currently will allow us to predict these numbers as well.
# We compare three models:
# - Polynomial Regression
# - Linear Regression
# - ElasticNet

# Copy the DataFrame so the regression prep does not mutate the original
model_df = all_cases.copy()

# Drop columns that are redundant for modelling
# (the original comment said "rows"; these are columns)
model_df = model_df.drop(columns=['abbrev', 'bedsPerThousand', 'hospitalized',
                                  'state', 'hospitalizedCumulative',
                                  'dataQualityGrade', 'lastUpdateEt'])

# Keep only rows where the target (hospitalizedCurrently) is known
model_df = model_df.dropna(subset=['hospitalizedCurrently'])

# Drop rows whose hospitalised/active ratio falls in the extreme top 1%.
# The ~(>= quantile) form deliberately KEEPS rows with a NaN ratio
# (NaN >= q is False), matching the original behaviour.
model_df['ratio_hospital'] = model_df['hospitalizedCurrently'] / model_df['active']
model_df = model_df[~(model_df['ratio_hospital'] >= model_df.ratio_hospital.quantile(0.99))]

#model_df = model_df[~(model_df['ratio_hospital'] <= model_df['ratio_hospital'].median())]
del model_df['ratio_hospital']

# Get peek of model to use
model_df.describe()
population positive active hospitalizedCurrently inIcuCurrently onVentilatorCurrently recovered death totalTestsViral positiveTestsViral negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade total_beds
count 4158.000 4158.000 4158.000 4158.000 2079.000 1810.000 4158.000 4158.000 1398.000 467.000 476.000 2993.000 4158.000 4158.000 4158.000 4158.000 4158.000 0.000 4158.000
mean 6582944.299 32875.871 29557.018 967.040 422.958 208.493 8599.506 1718.609 427436.422 27536.842 278249.819 37362.271 0.000 0.000 0.000 0.000 0.000 nan 15713.069
std 7551892.416 57744.789 52671.786 1858.455 697.439 318.142 16350.992 3620.500 610511.224 27402.108 259926.209 61846.425 0.000 0.000 0.000 0.000 0.000 nan 16166.658
min 567025.000 115.000 113.000 1.000 1.000 0.000 0.000 0.000 2857.000 407.000 8648.000 396.000 0.000 0.000 0.000 0.000 0.000 nan 1318.928
25% 1778070.000 3364.500 3084.750 102.250 74.000 31.000 144.000 91.000 77761.750 4163.500 67887.000 6394.000 0.000 0.000 0.000 0.000 0.000 nan 3773.952
50% 4499692.000 13051.500 11286.500 390.000 164.000 90.000 1709.000 470.000 225284.000 15192.000 199606.000 17031.000 0.000 0.000 0.000 0.000 0.000 nan 11557.920
75% 7797095.000 36704.500 31926.500 914.750 417.000 221.750 7610.500 1535.750 527436.250 48150.000 399926.000 41546.000 0.000 0.000 0.000 0.000 0.000 nan 19124.737
max 39937489.000 397649.000 553611.000 18825.000 5225.000 2425.000 103782.000 24913.000 4793353.000 91153.000 1086094.000 397649.000 0.000 0.000 0.000 0.000 0.000 nan 71887.480
###@rygomez